library(mdsr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.0 
## ✔ readr   2.1.0      ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
data("CIACountries")

Scatterplots

Base R

# Basic base-R scatterplot of the `educ` column against the `gdp` column.
# Axis titles default to the deparsed argument expressions.
plot(x = CIACountries$gdp, y = CIACountries$educ)

Add axis labels

# Same scatterplot, now with explicit axis titles.
with(
  CIACountries,
  plot(gdp, educ, xlab = "GDP", ylab = "Education")
)

Change the shape

# pch selects the plotting symbol; 16 is a filled circle.
with(
  CIACountries,
  plot(
    gdp, educ,
    xlab = "GDP", ylab = "Education",
    pch = 16
  )
)

Change the color

# col sets the colour used for the points.
with(
  CIACountries,
  plot(
    gdp, educ,
    xlab = "GDP", ylab = "Education",
    pch = 16, col = "red"
  )
)

Change the x and y limits

# xlim / ylim fix the plotted range of each axis.
with(
  CIACountries,
  plot(
    gdp, educ,
    xlab = "GDP", ylab = "Education",
    pch = 16, col = "red",
    xlim = c(0, 150000), ylim = c(0, 15)
  )
)

Change the x and y limits

# Repeats the previous plot: fixed x and y ranges via xlim / ylim.
with(
  CIACountries,
  plot(
    gdp, educ,
    xlab = "GDP", ylab = "Education",
    pch = 16, col = "red",
    xlim = c(0, 150000), ylim = c(0, 15)
  )
)

Aspect Ratio

# asp is the y/x aspect ratio: one y unit is drawn as long as asp x units.
with(
  CIACountries,
  plot(
    gdp, educ,
    xlab = "GDP", ylab = "Education",
    pch = 16, col = "red",
    xlim = c(0, 150000), ylim = c(0, 15),
    asp = 10000
  )
)

Change axis labels

# Suppress the default axes (xaxt / yaxt = "n") so that custom tick labels
# can be drawn afterwards with axis().
with(
  CIACountries,
  plot(
    gdp, educ,
    xlab = "GDP", ylab = "Education",
    pch = 16, col = "red",
    xlim = c(0, 150000), ylim = c(0, 15),
    asp = 10000,
    xaxt = "n", yaxt = "n"
  )
)

# side = 1 is the bottom (x) axis, side = 2 the left (y) axis.
axis(
  side = 1,
  at = c(0, 100000, 200000),
  labels = c("None", "A little", "A lot")
)
axis(
  side = 2,
  at = c(0, 5, 10, 15),
  labels = c("None", "Ok", "Good", "Great")
)

Size

# cex scales the plotting symbols; 0.5 draws them at half the default size.
with(
  CIACountries,
  plot(
    gdp, educ,
    xlab = "GDP", ylab = "Education",
    pch = 16, col = "red",
    xlim = c(0, 150000), ylim = c(0, 15),
    asp = 10000,
    xaxt = "n", yaxt = "n",
    cex = 0.5
  )
)

# Custom tick labels, as in the previous plot.
axis(
  side = 1,
  at = c(0, 100000, 200000),
  labels = c("None", "A little", "A lot")
)
axis(
  side = 2,
  at = c(0, 5, 10, 15),
  labels = c("None", "Ok", "Good", "Great")
)

ggplot

The key to using ggplot is to think of every command as a layer!

library(ggplot2)
# ggplot() declares the data and the aesthetic mapping; geom_point() adds the
# scatterplot layer on top.
ggplot(data = CIACountries, mapping = aes(x = gdp, y = educ)) +
  geom_point()
## Warning: Removed 64 rows containing missing values (geom_point).

#CIACountries %>% ggplot(aes(x = gdp, y = educ)) + geom_point()

Add axis labels

# Axis titles are added as one more layer.
ggplot(data = CIACountries, mapping = aes(x = gdp, y = educ)) +
  geom_point() +
  labs(x = "GDP", y = "Education")
## Warning: Removed 64 rows containing missing values (geom_point).

Change the shape

# shape is a fixed (non-mapped) setting, so it goes in the geom, not in aes().
ggplot(data = CIACountries, mapping = aes(x = gdp, y = educ)) +
  geom_point(shape = 23) +
  labs(x = "GDP", y = "Education")
## Warning: Removed 64 rows containing missing values (geom_point).

Change the color

# A constant colour is likewise set directly on the geom.
ggplot(data = CIACountries, mapping = aes(x = gdp, y = educ)) +
  geom_point(shape = 23, color = "red") +
  labs(x = "GDP", y = "Education")
## Warning: Removed 64 rows containing missing values (geom_point).

Change the x and y limits

# lims() sets both axis ranges in one call (equivalent to xlim() + ylim()).
ggplot(data = CIACountries, mapping = aes(x = gdp, y = educ)) +
  geom_point(shape = 23, color = "red") +
  labs(x = "GDP", y = "Education") +
  lims(x = c(0, 200000), y = c(0, 15))
## Warning: Removed 64 rows containing missing values (geom_point).

Aspect Ratio

# coord_fixed() fixes the aspect ratio: one y unit is drawn as long as
# `ratio` x units.
ggplot(data = CIACountries, mapping = aes(x = gdp, y = educ)) +
  geom_point(shape = 23, color = "red") +
  labs(x = "GDP", y = "Education") +
  lims(x = c(0, 200000), y = c(0, 15)) +
  coord_fixed(ratio = 10000)
## Warning: Removed 64 rows containing missing values (geom_point).

Change axis values

# Custom tick positions and labels. The xlim() / ylim() layers from the
# previous step are still present, so adding scale_*_continuous() replaces
# them and ggplot2 reports "Scale for ... is already present".
ggplot(data = CIACountries, mapping = aes(x = gdp, y = educ)) +
  geom_point(shape = 23, color = "red") +
  labs(x = "GDP", y = "Education") +
  xlim(0, 200000) +
  ylim(0, 15) +
  coord_fixed(ratio = 10000) +
  scale_x_continuous(
    limits = c(0, 200000),
    breaks = c(0, 100000, 200000),
    labels = c("None", "A little", "A lot")
  ) +
  scale_y_continuous(
    limits = c(0, 15),
    breaks = c(0, 5, 10, 15),
    labels = c("None", "Ok", "Good", "Great")
  )
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Warning: Removed 64 rows containing missing values (geom_point).

# Note: scale_x_continuous() replaces the scale created by xlim(), so its limits override the xlim() call!

Size

# size scales the points. The xlim() / ylim() layers are again replaced by
# the scale_*_continuous() calls that follow them.
ggplot(data = CIACountries, mapping = aes(x = gdp, y = educ)) +
  geom_point(shape = 23, color = "red", size = 0.5) +
  labs(x = "GDP", y = "Education") +
  xlim(0, 200000) +
  ylim(0, 15) +
  coord_fixed(ratio = 10000) +
  scale_x_continuous(
    limits = c(0, 200000),
    breaks = c(0, 100000, 200000),
    labels = c("None", "A little", "A lot")
  ) +
  scale_y_continuous(
    limits = c(0, 15),
    breaks = c(0, 5, 10, 15),
    labels = c("None", "Ok", "Good", "Great")
  )
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Warning: Removed 64 rows containing missing values (geom_point).

Univariate displays

Histograms

library(mdsr)
data(SAT_2010)

# Base R: histogram of the math column with the default bins.
hist(x = SAT_2010$math)

# ggplot2: with no binwidth given, geom_histogram() falls back to 30 bins.
ggplot(data = SAT_2010, mapping = aes(x = math)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# ggplot2 histogram with explicit (unequal-width) bin edges.
ggplot(data = SAT_2010, mapping = aes(x = math)) +
  geom_histogram(breaks = c(400, 450, 550, 600, 700))

# Base R: estimate the kernel density first, then draw it as a line.
dens <- density(x = SAT_2010$math)
plot(x = dens$x, y = dens$y, type = "l")

# ggplot2: one geom does both the estimation and the drawing, which gives
# cleaner code and a nicer-looking plot.
ggplot(data = SAT_2010, mapping = aes(x = math)) +
  geom_density()

# A very small bandwidth makes the density estimate much more jagged.
ggplot(data = SAT_2010, mapping = aes(x = math)) +
  geom_density(bw = 0.01)

# Base R bar chart: first ten rows, sorted in ascending order of math score.
sub <- head(SAT_2010, n = 10)
sub <- sub[order(sub[["math"]]), ]
barplot(height = sub[["math"]], names.arg = sub[["state"]])

# ggplot2 bar chart of the same ten rows; reorder() sorts the bars by math
# score instead of sorting the data beforehand.
head(SAT_2010, 10) %>%
  ggplot(aes(x = reorder(state, math), y = math)) +
  geom_col() +
  xlab("State") +
  ylab("Average math SAT score")

### Stacked bar chart

library(mosaicData)

# Stacked bar chart; position = "fill" stretches every bar to full height so
# the segments show proportions.
ggplot(mosaicData::HELPrct, aes(x = homeless, fill = substance)) +
  geom_bar(position = "fill")

# Same chart with the "Spectral" ColorBrewer palette.
ggplot(mosaicData::HELPrct, aes(x = homeless, fill = substance)) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Spectral")

# Same chart again, with the coordinates flipped.
ggplot(mosaicData::HELPrct, aes(x = homeless, fill = substance)) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Spectral") +
  coord_flip()

Multivariate displays

# Base scatterplot of math score against expenditure, stored in `g` so that
# later chunks can add layers to it.
g <- SAT_2010 %>%
  ggplot(aes(x = expenditure, y = math)) +
  geom_point()

We can easily add a trend line with ggplot.

# Add a loess trend line (without the confidence band) and axis titles.
g +
  geom_smooth(method = "loess", se = FALSE) +
  labs(
    x = "Average expenditure per student ($1000)",
    y = "Average score on math SAT"
  )
## `geom_smooth()` using formula 'y ~ x'

In base R, the same plot requires fitting the loess model yourself and then drawing the fitted curve on top of the scatterplot:

# Scatterplot first, then fit the loess model and overlay its fitted curve.
plot(x = SAT_2010$expenditure, y = SAT_2010$math)
a <- loess(SAT_2010$math ~ SAT_2010$expenditure)
# The fitted values are ordered by the predictor so the line runs left to
# right instead of zig-zagging.
lines(a$x[order(a$x)], a$fitted[order(a$x)], col = "red")

# Bin sat_pct (presumably a 0-100 percentage -- confirm against the data
# documentation) into three ordered bands stored in SAT_rate.
# include.lowest = TRUE keeps a value of exactly 0 in the "low" band; by
# default cut() builds the half-open interval (0, 30] and would turn 0 into NA.
SAT_2010 <- SAT_2010 %>%
  mutate(
    SAT_rate = cut(
      sat_pct,
      breaks = c(0, 30, 60, 100),
      labels = c("low", "medium", "high"),
      include.lowest = TRUE
    )
  )
# g <- g %+% SAT_2010
# g + aes(color = SAT_rate)
# Doing this in base R is possible, but it is a pain!


# Full code: points and per-group linear fits, both coloured by SAT_rate.
ggplot(
  data = SAT_2010,
  mapping = aes(x = expenditure, y = math, color = SAT_rate)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_brewer(palette = "Spectral")
## `geom_smooth()` using formula 'y ~ x'

Faceting

# Base R "faceting": one panel per SAT_rate level, drawn side by side.
# par() returns the previous settings, which are restored at the end;
# otherwise every later base-R plot in the session would still be squeezed
# into the 1 x 3 grid.
old_par <- par(mfrow = c(1, 3))

for (rate in c("low", "medium", "high")) {
  # which() drops rows whose SAT_rate is NA instead of indexing with NA.
  in_group <- which(SAT_2010$SAT_rate == rate)
  plot(
    SAT_2010$expenditure[in_group],
    SAT_2010$math[in_group],
    xlab = "expenditure",
    ylab = "math score"
  )
}

par(old_par)

So much easier in ggplot!

# g + facet_wrap(~ SAT_rate)
# g + facet_grid(~ SAT_rate)
# Full code: rebuild SAT_rate, then draw one panel per level with a linear fit.
# include.lowest = TRUE keeps a value of exactly 0 in the "low" band; by
# default cut() builds the half-open interval (0, 30] and would turn 0 into NA.
SAT_2010 <- SAT_2010 %>%
  mutate(
    SAT_rate = cut(
      sat_pct,
      breaks = c(0, 30, 60, 100),
      labels = c("low", "medium", "high"),
      include.lowest = TRUE
    )
  )
g <- ggplot(data = SAT_2010, mapping = aes(x = expenditure, y = math)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~ SAT_rate)
g
## `geom_smooth()` using formula 'y ~ x'

NHANES example

library(NHANES)
# Plot a random sample of 1000 rows. No seed is set, so the sample (and the
# number of rows dropped for missing values) changes from run to run.
NHANES %>%
  slice_sample(n = 1000) %>%
  ggplot(aes(x = Age, y = Height, color = Gender)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Age (years)", y = "Height (cm)", color = "Gender")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 38 rows containing non-finite values (stat_smooth).
## Warning: Removed 38 rows containing missing values (geom_point).

# Relevel the reference category: fct_relevel() moves "male" to the front of
# the Gender levels, which changes the legend order and colour assignment.
library(NHANES)
NHANES %>%
  slice_sample(n = 1000) %>%
  ggplot(aes(x = Age, y = Height, color = fct_relevel(Gender, "male"))) +
  geom_point() +
  geom_smooth() +
  labs(x = "Age (years)", y = "Height (cm)", color = "Gender")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 39 rows containing non-finite values (stat_smooth).
## Warning: Removed 39 rows containing missing values (geom_point).

library(macleish)
## Loading required package: etl
# Temperature over time as a grey line, with a smoother on top. The x-axis
# title is removed (x = NULL).
whately_2015 %>%
  ggplot(aes(x = when, y = temperature)) +
  geom_line(color = "darkgray") +
  geom_smooth() +
  labs(x = NULL, y = "Temperature (degrees Celsius)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# whately_2015 %>%
#   mutate(month = as.factor(lubridate::month(when, label = TRUE))) %>%
#   group_by(month) %>% 
#   skim(temperature) %>%
#   select(-na)

# Add a `month` factor (labelled month names) derived from the `when` column;
# the base-R boxplot below groups by it.
whately_2015[["month"]] <- as.factor(
  lubridate::month(whately_2015[["when"]], label = TRUE)
)


# Base R: one box per month via the formula interface. expression() is used
# so that the degree symbol is rendered in the y-axis title.
boxplot(
  temperature ~ month,
  data = whately_2015,
  xlab = "Month",
  ylab = expression("Temperature (" * ~degree * C * ")")
)

# ggplot2: the month label is computed inside aes(), so no extra column is
# needed.
whately_2015 %>%
  ggplot(aes(x = lubridate::month(when, label = TRUE), y = temperature)) +
  geom_boxplot() +
  labs(
    x = "Month",
    y = expression("Temperature (" * ~degree * C * ")")
  )

Check out the extended example here: https://mdsr-book.github.io/mdsr2e/ch-vizII.html#sec:babynames

In class exercise

Make these plots!

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'